borovecki <- borovecki |>
mutate(
gene = mapIds(
hgu133a.db,
keys=probeID,
keytype = "PROBEID",
column="SYMBOL",
multiVals="first"
)
) |>
filter(!is.na(gene)) |>
dplyr::select(-probeID)
borovecki_geneLevel <- borovecki |>
group_by(sample, gene) |>
summarise(expression = mean(expression), .groups = "drop")raw_histogram <- ggplot(borovecki_data, aes(x = expressionValue,fill = "Raw data")) +
geom_histogram(bins = 50, alpha = 0.6) +
fill_scale+
theme_minimal(base_size = 14) +
labs(x = "Expression value",
y = "Count") +
theme(plot.title = element_text(size = 13,
hjust = 0.5))
log2_histogram <- ggplot(borovecki_data, aes(x = log2_expression, fill = "Log2 data")) +
geom_histogram(bins = 100, alpha = 0.6) +
fill_scale +
theme_minimal(base_size = 14) +
labs(x = "log2(Expression value)",
y = "Count") +
theme(plot.title = element_text(size = 13,
hjust = 0.5))
raw_histogram + log2_histogram +
plot_layout(guides = "collect")+
plot_annotation(title="Comparison between Raw and Log2-Transformed Expression Data",
caption="Data from: Borovecki et al. 2005") &
theme(legend.position = "right")borovecki_tibble <- borovecki_tibble |>
pivot_wider(
# Columns not to pivot
id_cols = c(sample, diagnosis),
# Columns to pivot
names_from = gene,
values_from = expressionValue
) |>
mutate(
Diagnosis = case_when(
diagnosis == "symptomatic" ~ 1,
diagnosis == "control" ~ 0
)
) |>
dplyr::select(-c(diagnosis,sample)) |>
relocate(Diagnosis, .before = 1)
borovecki_tibble_long <- borovecki_tibble |>
pivot_longer(
cols = -Diagnosis,
names_to = "gene",
values_to = "log2_expr_level"
) |>
mutate(
log2_expr_level = log2(log2_expr_level)
)
borovecki_tibble_long_nested <- borovecki_tibble_long |>
group_by(gene) |>
nest() |>
ungroup()by_gene <- borovecki_tibble_long_nested |>
group_by(gene)
linear_model <- function(df) {
lm(
log2_expr_level ~ Diagnosis ,
data = df)
}
borovecki_tibble_long_nested <- borovecki_tibble_long_nested |>
mutate(model_object = map(data, linear_model))
tidy_data <- function(df) {
df |>
tidy(conf.int = TRUE,
conf.level = 0.95)
}
borovecki_estimates <- borovecki_tibble_long_nested |>
mutate(model_object_tidy = map(model_object, tidy_data)) |>
unnest(model_object_tidy) |>
filter(term == "Diagnosis") |>
dplyr::select(gene, p.value, estimate, conf.low, conf.high) |>
mutate(q.value=p.adjust(p.value),
is_significant = case_when(
q.value >= 0.05 ~ "no",
q.value < 0.05 ~ "yes"
) Results
Collaboration between scientists and bioinformaticians allows for a better understanding of how diseases work